R Markdown

Attempting to do some problem solving.

  1. Read in the gapminder_clean.csv data as a tibble using read_csv.

  2. Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered data.

# Load the gapminder data as a plain data frame.
# NOTE(review): the task statement asks for read_csv(); the rest of this
# document relies on the dotted column names produced by read.csv(), so
# confirm before switching readers.
data <- read.csv(
  "C:/Users/User/Desktop/Github/RforDataScience/gapminder_clean.csv"
)

# Keep the 1962 rows that have both GDP per capita and CO2 emissions,
# then convert the result to a tibble for compact printing.
y1962 <- as_tibble(
  filter(
    data,
    Year == 1962,
    !is.na(gdpPercap),
    !is.na(CO2.emissions..metric.tons.per.capita.)
  )
)

head(y1962)
## # A tibble: 6 × 20
##       X Country.Name  Year Agriculture..value.added....…¹ CO2.emissions..metri…²
##   <int> <chr>        <int>                          <dbl>                  <dbl>
## 1     0 Afghanistan   1962                             NA                 0.0738
## 2    10 Albania       1962                             NA                 1.44  
## 3    20 Algeria       1962                             NA                 0.485 
## 4    50 Angola        1962                             NA                 0.216 
## 5    80 Argentina     1962                             NA                 2.52  
## 6   110 Australia     1962                             NA                 8.84  
## # ℹ abbreviated names: ¹​Agriculture..value.added....of.GDP.,
## #   ²​CO2.emissions..metric.tons.per.capita.
## # ℹ 15 more variables:
## #   Domestic.credit.provided.by.financial.sector....of.GDP. <dbl>,
## #   Electric.power.consumption..kWh.per.capita. <dbl>,
## #   Energy.use..kg.of.oil.equivalent.per.capita. <dbl>,
## #   Exports.of.goods.and.services....of.GDP. <dbl>, …
# Scatter plot of GDP per capita against CO2 emissions for the 1962 rows,
# rendered as an interactive plotly widget.
plot1962 <- ggplot(
  data = y1962,
  mapping = aes(
    x = CO2.emissions..metric.tons.per.capita.,
    y = gdpPercap
  )
) +
  geom_point()

ggplotly(plot1962)
  3. On the filtered data, calculate the correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap. What is the correlation and associated p value?
# Extract the two columns as plain vectors. The variable names are kept
# because cor.test() echoes them in the "data:" line of its output.
gdpvar <- y1962[["gdpPercap"]]
co2var <- y1962[["CO2.emissions..metric.tons.per.capita."]]

cor.test(co2var, gdpvar, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  co2var and gdpvar
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8934697 0.9489792
## sample estimates:
##       cor 
## 0.9260817

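Pearson’s product-moment correlation is appropriate for measuring the linear association between two continuous variables. For 1962, the correlation between CO2 emissions per capita and gdpPercap is r ≈ 0.926, with p < 2.2e-16.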

  4. On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…
# Pearson correlation between CO2 emissions and GDP per capita for each year.
#
# Result: `cordata`, a data frame with one row per year and the columns
# `year`, `correlation` and `p.value`, sorted from the strongest to the
# weakest correlation.

# Years that have at least one GDP per capita value, in ascending order.
year_values <- sort(unique(data$Year[!is.na(data$gdpPercap)]))

# Build one single-row data frame per year and bind them once at the end,
# instead of growing a data frame with rbind() inside a loop.
cor_rows <- lapply(year_values, function(cyear) {
  year_rows <- filter(data, Year == cyear)

  # Run the test once and read both the estimate and the p-value from it.
  test <- cor.test(
    year_rows[["CO2.emissions..metric.tons.per.capita."]],
    year_rows[["gdpPercap"]],
    method = "pearson"
  )

  data.frame(
    year = cyear,
    # unname() drops the "cor" label so it does not become a row name.
    correlation = unname(test$estimate),
    p.value = test$p.value
  )
})

cordata <- bind_rows(cor_rows) %>%
  arrange(desc(correlation))

cordata
##    year correlation      p.value
## 1  1967   0.9387918 3.397143e-53
## 2  1962   0.9260817 1.128679e-46
## 3  1972   0.8428986 1.824292e-32
## 4  1982   0.8166384 5.565916e-29
## 5  1987   0.8095531 3.899627e-28
## 6  1992   0.8094316 1.610614e-29
## 7  1997   0.8081396 7.976156e-30
## 8  2002   0.8006421 3.863564e-29
## 9  1977   0.7928336 2.838892e-26
## 10 2007   0.7204169 9.232747e-22
  5. Using plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.
# Rows for 1967, the year with the strongest correlation found above.
# Rows missing either plotted variable are dropped here, matching the 1962
# filter, so ggplot does not have to discard them while drawing.
y1967 <- data %>%
  filter(
    Year == 1967,
    !is.na(gdpPercap),
    !is.na(CO2.emissions..metric.tons.per.capita.)
  )

# Point size encodes population and colour encodes continent.
# The empty `size` label hides the title of the size legend.
plot67 <- ggplot(
  y1967,
  aes(
    x = CO2.emissions..metric.tons.per.capita.,
    y = gdpPercap,
    size = pop,
    color = continent
  )
) +
  geom_point() +
  labs(
    title = "Correlation between CO2 emission and GDP per capita",
    x = "CO2 emissions (metric tons per capita)",
    y = "GDP per capita",
    size = ""
  )

ggplotly(plot67)
  6. What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’? (stats test needed)
# Electric power consumption per capita by continent, dropping rows with a
# missing measurement or a blank continent.
# NOTE(review): the question asks about 'Energy use (kg of oil equivalent
# per capita)', but this analysis uses the electric power consumption
# column -- confirm which column is intended.
cont_energy <- data %>%
  filter(
    !is.na(Electric.power.consumption..kWh.per.capita.),
    continent != ""
  ) %>%
  select(Electric.power.consumption..kWh.per.capita., continent)

# Plain vectors of the measurement and its grouping variable.
elec <- cont_energy[["Electric.power.consumption..kWh.per.capita."]]
cont <- cont_energy[["continent"]]

# Consumption values split out per continent.
africa_energy <- elec[cont == "Africa"]
america_energy <- elec[cont == "Americas"]
asia_energy <- elec[cont == "Asia"]
europe_energy <- elec[cont == "Europe"]
oceania_energy <- elec[cont == "Oceania"]

# Normality check; only the Africa group is tested here.
shapiro.test(africa_energy)
## 
##  Shapiro-Wilk normality test
## 
## data:  africa_energy
## W = 0.59218, p-value < 2.2e-16
# Rank-based comparison of consumption across all continents.
kruskal.test(
  Electric.power.consumption..kWh.per.capita. ~ continent,
  data = cont_energy
)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Electric.power.consumption..kWh.per.capita. by continent
## Kruskal-Wallis chi-squared = 353.23, df = 4, p-value < 2.2e-16
# Box plot of consumption per continent, shown as an interactive widget.
pconsume <- ggplot(
  cont_energy,
  aes(x = continent, y = Electric.power.consumption..kWh.per.capita.)
) +
  geom_boxplot() +
  labs(
    x = "Continents",
    y = "Electric power consumption kWh per capita"
  )

ggplotly(pconsume)

# Pairwise comparisons between continents with Bonferroni adjustment.
library(dunn.test)
dunn.test(elec, cont, method = "bonferroni")
##   Kruskal-Wallis rank sum test
## 
## data: elec and cont
## Kruskal-Wallis chi-squared = 353.2299, df = 4, p-value = 0
## 
## 
##                           Comparison of elec by cont                           
##                                  (Bonferroni)                                  
## Col Mean-|
## Row Mean |     Africa   Americas       Asia     Europe
## ---------+--------------------------------------------
## Americas |  -6.474696
##          |    0.0000*
##          |
##     Asia |  -5.544423   0.906730
##          |    0.0000*     1.0000
##          |
##   Europe |  -17.28571  -10.36770  -11.29266
##          |    0.0000*    0.0000*    0.0000*
##          |
##  Oceania |  -8.703935  -5.861469  -6.255830  -1.648792
##          |    0.0000*    0.0000*    0.0000*     0.4960
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

There is no correlation test for a continuous variable and a nominal variable. Therefore, I checked whether there were statistically significant differences among the continents.

The Kruskal–Wallis test was appropriate because the data did not pass the Shapiro–Wilk normality test. Further pairwise analysis was performed using Dunn’s test.

Differences in electrical power consumption among different continents are statistically significant. (Kruskal Wallis test / Dunn’s test)

While most pairs of continents differed significantly, there were two exceptions:

No significant difference was found between the power consumption of Asia and that of the Americas.

No significant difference was found between the power consumption of Europe and that of Oceania.

  7. Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)
# Imports (% of GDP) for European and Asian rows in years after 1990,
# keeping only rows where the import share is present.
AsEuImport <- data %>%
  filter(
    Year > 1990,
    continent %in% c("Europe", "Asia"),
    !is.na(Imports.of.goods.and.services....of.GDP.)
  ) %>%
  select(continent, Imports.of.goods.and.services....of.GDP.)

# Normality check on the Asian subset.
shapiro.test(
  AsEuImport$Imports.of.goods.and.services....of.GDP.[
    AsEuImport$continent == "Asia"
  ]
)
## 
##  Shapiro-Wilk normality test
## 
## data:  AsEuImport$Imports.of.goods.and.services....of.GDP.[AsEuImport$continent == "Asia"]
## W = 0.8549, p-value = 2.31e-08
# The Shapiro-Wilk test above rejected normality, so the two continents
# are compared with the Wilcoxon rank sum test.
wilcox.test(
  Imports.of.goods.and.services....of.GDP. ~ continent,
  data = AsEuImport
)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Imports.of.goods.and.services....of.GDP. by continent
## W = 5707, p-value = 0.7867
## alternative hypothesis: true location shift is not equal to 0

The data failed the Shapiro–Wilk normality test. Therefore, the Wilcoxon rank sum test was used.

There is no statistically significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990 (W = 5707, p = 0.7867).

  8. What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)
# Most densely populated country in each year (ties are kept).
# NOTE(review): the question asks for the highest *average ranking* across
# all time points; this reports only each year's top country -- confirm
# whether a per-year rank averaged by country is needed.
mdensity <- data %>%
  group_by(Year) %>%
  slice_max(
    Population.density..people.per.sq..km.of.land.area.,
    n = 1
  ) %>%
  select(
    Year,
    Country.Name,
    Population.density..people.per.sq..km.of.land.area.
  )

mdensity
## # A tibble: 10 × 3
## # Groups:   Year [10]
##     Year Country.Name     Population.density..people.per.sq..km.of.land.area.
##    <int> <chr>                                                          <dbl>
##  1  1962 Monaco                                                        11521 
##  2  1967 Monaco                                                        11648.
##  3  1972 Macao SAR, China                                              12714.
##  4  1977 Monaco                                                        12904.
##  5  1982 Monaco                                                        13814.
##  6  1987 Macao SAR, China                                              16133.
##  7  1992 Macao SAR, China                                              18890.
##  8  1997 Macao SAR, China                                              20602.
##  9  2002 Macao SAR, China                                              16451.
## 10  2007 Monaco                                                        17523
# The eight most densely populated countries in each year.
mdensity2 <- data %>%
  group_by(Year) %>%
  slice_max(
    Population.density..people.per.sq..km.of.land.area.,
    n = 8
  ) %>%
  select(
    Year,
    Country.Name,
    Population.density..people.per.sq..km.of.land.area.
  )

# One point per country and year, coloured by country.
mdenplot <- ggplot(
  mdensity2,
  aes(
    x = Year,
    y = Population.density..people.per.sq..km.of.land.area.,
    color = Country.Name
  )
) +
  geom_point()

ggplotly(mdenplot)

Monaco and Macao SAR, China have the highest ‘Population density (people per sq. km of land area)’ across all years: each of them is the most densely populated in five of the ten time points.

  9. What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ between 1962 and 2007?
# Life expectancy per country in 1962, renamed while selecting.
birth1962 <- data %>%
  filter(Year == 1962) %>%
  select(
    Country.Name,
    `Life expectancy 1962` = Life.expectancy.at.birth..total..years.
  )

# Life expectancy per country in 2007, renamed while selecting.
birth2007 <- data %>%
  filter(Year == 2007) %>%
  select(
    Country.Name,
    `Life expectancy 2007` = Life.expectancy.at.birth..total..years.
  )

# Keep countries present in both years, compute the change, and sort so the
# largest increase comes first.
merged <- merge(birth1962, birth2007, by = "Country.Name")
merged[["Life expectancy increase"]] <-
  merged[["Life expectancy 2007"]] - merged[["Life expectancy 1962"]]
merged <- arrange(merged, desc(`Life expectancy increase`))

head(merged)
##   Country.Name Life expectancy 1962 Life expectancy 2007
## 1     Maldives             38.48356             75.39971
## 2       Bhutan             33.09415             66.29310
## 3  Timor-Leste             34.73905             65.82420
## 4      Tunisia             43.34168             74.20244
## 5         Oman             44.30051             75.12361
## 6        Nepal             35.95229             66.55193
##   Life expectancy increase
## 1                 36.91615
## 2                 33.19895
## 3                 31.08515
## 4                 30.86076
## 5                 30.82310
## 6                 30.59963